{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text Classification\n",
    "*Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. Please check the pdf file for more details.*\n",
    "\n",
    "In this exercise you will:\n",
    "    \n",
    "- implement a of spam classifier with **Naive Bayes method** for real world email messages\n",
    "- learn the **training and testing phase** for Naive Bayes classifier  \n",
    "- get an idea of the **precision-recall** tradeoff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# some basic imports\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.sparse\n",
    "%matplotlib inline\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\scipy\\sparse\\compressed.py:746: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.\n",
      "  SparseEfficiencyWarning)\n"
     ]
    }
   ],
   "source": [
    "# ham_train contains the occurrences of each word in ham emails. 1-by-N vector\n",
    "ham_train = np.loadtxt('ham_train.csv', dtype=np.int64 , delimiter=',')\n",
    "# spam_train contains the occurrences of each word in spam emails. 1-by-N vector\n",
    "spam_train = np.loadtxt('spam_train.csv', dtype=np.int64 , delimiter=',')\n",
    "# N is the size of vocabulary. 77386 \n",
    "N = ham_train.shape[0]\n",
    "# There 9034 ham emails and 3372 spam emails in the training samples\n",
    "num_ham_train = 9034\n",
    "num_spam_train = 3372\n",
    "# Do smoothing\n",
    "x = np.vstack([ham_train, spam_train]) + 1\n",
    "\n",
    "# ham_test contains the occurences of each word in each ham test email. P-by-N vector, with P is number of ham test emails.\n",
    "i,j,ham_test = np.loadtxt('ham_test.txt' , dtype=np.int64).T\n",
    "ham_test_tight = scipy.sparse.coo_matrix((ham_test, (i - 1, j - 1)))\n",
    "ham_test = scipy.sparse.csr_matrix((ham_test_tight.shape[0], ham_train.shape[0]))\n",
    "ham_test[:, 0:ham_test_tight.shape[1]] = ham_test_tight\n",
    "# spam_test contains the occurences of each word in each spam test email. Q-by-N vector, with Q is number of spam test emails.\n",
    "i,j,spam_test = np.loadtxt('spam_test.txt' , dtype=np.int64).T\n",
    "spam_test_tight = scipy.sparse.csr_matrix((spam_test, (i - 1, j - 1)))\n",
    "spam_test = scipy.sparse.csr_matrix((spam_test_tight.shape[0], spam_train.shape[0]))\n",
    "spam_test[:, 0:spam_test_tight.shape[1]] = spam_test_tight"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Now let's implement a ham/spam email classifier. Please refer to the PDF file for details"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[52913 27709  2571 23471 23631  8058 22602 33609 35796 60120]\n",
      "3011 28\n",
      "1124 31\n"
     ]
    }
   ],
   "source": [
    "from likelihood import likelihood\n",
    "# TODO\n",
    "# Implement a ham/spam email classifier, and calculate the accuracy of your classifier\n",
    "\n",
    "# begin answer\n",
    "l = likelihood(x)\n",
    "ratio_spam_ham = l[1] / l[0]\n",
    "ratio_spam_ham_sort = np.argsort(ratio_spam_ham)\n",
    "print(ratio_spam_ham_sort[0:10])\n",
    "log_likelihood_T = np.log(l).T\n",
    "\n",
    "prior = np.zeros(2)\n",
    "prior[0] = num_ham_train / (num_ham_train + num_spam_train)\n",
    "prior[1] = num_spam_train / (num_ham_train + num_spam_train)\n",
    "\n",
    "ham_test_classification = ham_test * log_likelihood_T + np.log(prior)\n",
    "ham_test_classification_result = [0 if a > 0 else 1 for a in  ham_test_classification[:,0] - ham_test_classification[:,1]]\n",
    "\n",
    "print(ham_test.shape[0],sum(ham_test_classification_result))\n",
    "\n",
    "spam_test_classification = spam_test * log_likelihood_T + np.log(prior)\n",
    "spam_test_classification_result = [1 if a > 0 else 0 for a in  spam_test_classification[:,0] - spam_test_classification[:,1]]\n",
    "print(spam_test.shape[0],sum(spam_test_classification_result))\n",
    "# end answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
